---
title: 
  | 
  | Replication Material
  | 
  | What Makes Policy Complex? 
  |
author: "Roman Senninger"
date: "Contact: rsenninger@ps.au.dk"
output: 
  bookdown::html_document2:
    theme: cosmo
    highlight: kate
    toc: true
    number_sections: false
    toc_float: 
     collapsed: true
     smooth_scroll: true
    code_folding: hide
---

```{r setup, include = F}
# Directory holding the replication data files that the chunks below load via
# relative paths (e.g. "./list.RData"). Set the REPLICATION_DATA_DIR
# environment variable to point at a local copy of the data; when it is unset
# or empty, the original author's path is used, so existing setups keep working.
data_dir <- Sys.getenv("REPLICATION_DATA_DIR", unset = "")
if (!nzchar(data_dir)) {
  data_dir <- "/Users/au506709/Dropbox/Papers/Measuring_Complexity/data/replication_files/data/"
}
knitr::opts_knit$set(root.dir = data_dir)

# Cache chunk results between knits.
knitr::opts_chunk$set(cache = TRUE)

# Seed the random number generator so later chunks start from a fixed state.
set.seed(123)
```

# Link to Paper

Pre-print: https://osf.io/preprints/socarxiv/qa5ug/ 

# Data 

This manual presents the code needed to reproduce the results in the main body of the paper and in the supplementary materials. Please visit the PSRM Dataverse to download the data used in this manual.

# Load Required Packages

```{r, eval = T, echo = T, include= T, message= FALSE}


# Packages attached for this manual.
packages <- c(
  "BradleyTerry2", "corrplot", "plyr", "quanteda", "randomForest",
  "readstata13", "spacyr", "tidytext", "ggplot2", "quanteda.textstats",
  "quanteda.textplots", "caret", "flexmix"
)

# Attach every package in the listed order; library() stops with an error if
# one of them is not installed. The result is wrapped in invisible() because a
# top-level lapply() would otherwise auto-print its return value (one
# search-path vector per package) into the rendered document.
invisible(lapply(packages, library, character.only = TRUE))


```

# Session Info
```{r, eval = T, echo = T, include= T, message= FALSE}

# Print the R version, platform and attached package versions used for this run.
sessionInfo()
```


# Table 2, Table SI 3, Table SI 4

```{r, eval = T, echo = T, include=TRUE, warning=F}

# Load the paired-comparison data. The file is expected to provide the object
# `list` that is passed to BTm() as `data` below.
load("./list.RData")

# Bias-reduced (br = TRUE) structured Bradley-Terry models. In every model the
# outcome is fixed to 1 with `easier` and `harder` as the two players, and the
# ability formula is indexed by the "ID" factor.

# Model 1: Flesch score only
textModel_str1 <- BTm(1, easier, harder, id = "ID", ~ Flesch[ID],
                      data = list, br = TRUE)

# Model 2: mean sentence length (characters) and number of tokens
textModel_str2 <- BTm(1, easier, harder, id = "ID",
                      ~ meanSentenceChars[ID] + n_token[ID],
                      data = list, br = TRUE)

# Model 3: model 2 plus the count of regulatory references
textModel_str3 <- BTm(1, easier, harder, id = "ID",
                      ~ meanSentenceChars[ID] + n_token[ID] +
                        reg_ref_count[ID],
                      data = list, br = TRUE)

# Model 4: Flesch score plus the count of regulatory references
textModel_str4 <- BTm(1, easier, harder, id = "ID",
                      ~ Flesch[ID] + reg_ref_count[ID],
                      data = list, br = TRUE)

# Model summaries (the AIC values are read from this output)
summary(textModel_str1)
summary(textModel_str2)
summary(textModel_str3)
summary(textModel_str4)

# BIC of each model
BIC(textModel_str1)
BIC(textModel_str2)
BIC(textModel_str3)
BIC(textModel_str4)

# Accuracy in Table 2

# Fit measure: share of observations whose predicted response probability
# exceeds 0.5.
#   x: a fitted model object accepted by predict(x, type = "response").
#      NOTE(review): the default `BTFRE` is not defined anywhere in this file,
#      so callers have to supply `x` explicitly.
# Returns a single number between 0 and 1.
prop.correct <- function(x = BTFRE) {
  fitted_prob <- predict(x, type = "response")
  mean(fitted_prob > .5)
}

# Share of correctly predicted comparisons for each structured model, divided
# by a fixed constant.
# NOTE(review): the origin of 0.7817519 is not shown in this file; confirm
# against the paper.
accuracy_divisor <- 0.7817519

prop.correct(textModel_str1) / accuracy_divisor
prop.correct(textModel_str2) / accuracy_divisor
prop.correct(textModel_str3) / accuracy_divisor
prop.correct(textModel_str4) / accuracy_divisor


```


# Table 3

```{r, eval = T, echo = T, include=TRUE, warning=F}

# Resampling scheme shared by every model in this chunk:
# 10-fold cross-validation, repeated 5 times. The models are fitted in a fixed
# order because each train() call draws its resamples from the RNG stream.

# ---- First dataset (expected to provide `df_sub`) ----
load("./tab3_1.RData")

train.control <- trainControl(method = "repeatedcv", number = 10, repeats = 5)

# Flesch score as the only predictor
modelcv1 <- train(
  ratio_combined ~ meanFlesch,
  data = df_sub,
  method = "lm",
  trControl = train.control
)
print(modelcv1)

# n, mean sentence length and word count as predictors
modelcv2 <- train(
  ratio_combined ~ n + meanSentenceChars + countWords,
  data = df_sub,
  method = "lm",
  trControl = train.control
)
print(modelcv2)

# ---- Second dataset (expected to provide `epsr`) ----
load("./tab3_2.RData")

# Number of recitals (RECIT) as the only predictor
modelcv1 <- train(
  ratio_combined ~ RECIT,
  data = epsr,
  method = "lm",
  trControl = train.control
)
print(modelcv1)

modelcv2 <- train(
  ratio_combined ~ meanSentenceChars + n + countWords,
  data = epsr,
  method = "lm",
  trControl = train.control
)
print(modelcv2)

# ---- Third dataset (expected to provide `mm`) ----
load("./tab3_3.RData")

# Number of recitals as the only predictor
modelcv1 <- train(
  ratio_combined ~ recitals,
  data = mm,
  method = "lm",
  trControl = train.control
)
print(modelcv1)

modelcv2 <- train(
  ratio_combined ~ meanSentenceChars + n + countWords,
  data = mm,
  method = "lm",
  trControl = train.control
)
print(modelcv2)


```

# Figure SI 1 

```{r, eval = T, echo = T, include=TRUE, fig.width= 3, fig.height= 3, fig.align = "center", out.width="30%", warning=F}

# Data for both panels (expected to provide `epsr` and `cps`)
load("./fig_SI1.RData")

# Both histograms use the same 5-unit bins over 0-110
recital_breaks <- seq(0, 110, by = 5)

# Left panel: recitals in the `epsr` data
hist(
  epsr$RECIT,
  breaks = recital_breaks,
  col = "gray70",
  border = FALSE,
  main = "",
  xlab = "Recitals",
  ylab = "",
  xlim = c(0, 110),
  ylim = c(0, 200)
)
text(60, 125, "Source: \nSteunenberg and \nRhinard 2010 \nN=317", cex = 0.8)

# Right panel: recitals in the `cps` data, drawn on identical axes
hist(
  cps$recitals,
  breaks = recital_breaks,
  col = "gray70",
  border = FALSE,
  main = "",
  ylab = "",
  xlab = "Recitals",
  xlim = c(0, 110),
  ylim = c(0, 200)
)
text(60, 125, "Source: \nReh et al 2013 \nN=797", cex = 0.8)


```

# Table SI 2

```{r, eval = T, echo = T, include=TRUE, fig.width= 3, warning=F}

# Load the data (expected to provide `list`, whose `data` element holds the
# covariates)
load("./list.RData")

# Print numbers in fixed rather than scientific notation
options(scipen = 999)

covariates <- c(
  "n_token", "meanWordChars", "meanSentenceChars", "google_min_2000",
  "reg_ref", "reg_ref_count", "abr", "abr_count", "whereas", "Flesch"
)

# Keep only the covariate columns. The intermediate is called `covariate_data`
# so that it does not reuse the name of base::table().
covariate_data <- list$data[covariates]

# Mean, standard deviation, minimum and maximum of each covariate. vapply()
# guarantees a numeric 4 x length(covariates) matrix and fails loudly if a
# column does not yield four numeric statistics.
stats <- vapply(
  covariate_data,
  function(x) c(mean = mean(x), sd = sd(x), min = min(x), max = max(x)),
  FUN.VALUE = c(mean = 0, sd = 0, min = 0, max = 0)
)

# One column per covariate, one row per statistic
summary_table <- as.data.frame(stats)

print(summary_table)


```

# Figure SI 4 

```{r, eval = T, echo = T, include=TRUE, fig.width= 6, fig.height= 6, fig.align = "center", out.width="50%", warning=F}

# Load the data (expected to provide `list`)
load("./list.RData")

# Covariates included in the correlation plot
terms <- c(
  "n_token", "meanWordChars", "meanSentenceChars", "google_min_2000",
  "reg_ref", "reg_ref_count", "abr", "abr_count", "whereas", "Flesch"
)

X <- list$data[terms]

# Pairwise correlations, drawn as circles in the upper triangle only
corX <- cor(X)
corrplot(corX, method = "circle", type = "upper")

```

# Figure SI 5 

```{r, eval = T, echo = T, include=TRUE, fig.width= 8, fig.height= 6, fig.align = "center", out.width="50%", warning=F}

# Load the data (expected to provide `list`) and the pre-fitted unstructured
# Bradley-Terry model (expected to provide `BT_unstruc_brT_list`)
load("./list.RData")
load("./BT_unstruc_brT_list.rda")

# The bias-reduced unstructured model takes about 11 minutes to fit, which is
# why the fitted object is loaded from disk above. To refit it, run:
# BT_unstruc_brT_list <- BTm(1, easier, harder,
#                            data = list, br = TRUE)

# Estimated abilities from the unstructured model, used as the random-forest
# outcome ("easiness"); the row names carry the item IDs
y <- as.data.frame(BTabilities(BT_unstruc_brT_list)[, "ability"])
names(y) <- c("easiness")

# Drop items without an estimated ability. is.na() tests for missing values
# directly instead of comparing the numeric column with the string "NA".
y <- y[!is.na(y$easiness), , drop = FALSE]
y$ID <- rownames(y)

# Attach the abilities to the covariates and sort by ID
list$data <- merge(list$data, y, by = "ID")
list$data <- list$data[order(list$data$ID), ]

# Predictors for the random forest
terms <- c("meanWordChars", "meanSentenceChars", "google_min_2000",
           "reg_ref", "n_token", "whereas", "abr", "abr_count",
           "reg_ref_count", "Flesch")

X <- list$data[terms]
y <- list$data$easiness

# 1000 trees, 3 candidate variables per split, with importance measures
mod <- randomForest(X, y = y, ntree = 1000, importance = TRUE, mtry = 3)

importance(mod)
varImpPlot(mod, main = "", pch = 16, type = 1)


```

# Table SI 5

```{r, eval = T, echo = T, include=TRUE, warning=F}

# Data for this table (expected to provide `df_sub`)
load("./tab3_1.RData")

# 10-fold cross-validation, repeated 5 times. The models are fitted in a fixed
# order because each train() call draws its resamples from the RNG stream.
train.control <- trainControl(method = "repeatedcv", number = 10, repeats = 5)

# Outcome: delegation.ratio
modelcv3 <- train(
  delegation.ratio ~ meanFlesch,
  data = df_sub,
  method = "lm",
  trControl = train.control
)
print(modelcv3)

modelcv4 <- train(
  delegation.ratio ~ n + meanSentenceChars + countWords,
  data = df_sub,
  method = "lm",
  trControl = train.control
)
print(modelcv4)

# Outcome: delegation.ratiocom
modelcv1 <- train(
  delegation.ratiocom ~ meanFlesch,
  data = df_sub,
  method = "lm",
  trControl = train.control
)
print(modelcv1)

modelcv2 <- train(
  delegation.ratiocom ~ n + meanSentenceChars + countWords,
  data = df_sub,
  method = "lm",
  trControl = train.control
)
print(modelcv2)


```

# Table SI 6

```{r, eval = T, echo = T, include=TRUE,  warning=F}

# Data for this table (expected to provide `epsr`)
load("./tab3_2.RData")

# 10-fold cross-validation, repeated 5 times. The models are fitted in a fixed
# order because each train() call draws its resamples from the RNG stream.
train.control <- trainControl(method = "repeatedcv", number = 10, repeats = 5)

# Outcome: delegation.ratio
modelcv3 <- train(
  delegation.ratio ~ RECIT,
  data = epsr,
  method = "lm",
  trControl = train.control
)
print(modelcv3)

modelcv4 <- train(
  delegation.ratio ~ meanSentenceChars + n + countWords,
  data = epsr,
  method = "lm",
  trControl = train.control
)
print(modelcv4)

# Outcome: delegation.ratiocom
modelcv1 <- train(
  delegation.ratiocom ~ RECIT,
  data = epsr,
  method = "lm",
  trControl = train.control
)
print(modelcv1)

modelcv2 <- train(
  delegation.ratiocom ~ meanSentenceChars + n + countWords,
  data = epsr,
  method = "lm",
  trControl = train.control
)
print(modelcv2)

```

# Table SI 7

```{r, eval = T, echo = T, include=TRUE, warning=F}

# Data for this table (expected to provide `mm`)
load("./tab3_3.RData")

# 10-fold cross-validation, repeated 5 times. Defined here, as in the chunks
# for Table SI 5 and Table SI 6, so that this chunk does not depend on an
# object left behind by an earlier chunk.
train.control <- trainControl(method = "repeatedcv", number = 10, repeats = 5)

# Outcome: delegation.ratio
modelcv3 <- train(delegation.ratio ~ recitals, data = mm,
                  method = "lm",
                  trControl = train.control)

print(modelcv3)

modelcv4 <- train(delegation.ratio ~ meanSentenceChars + n + countWords, data = mm,
                  method = "lm",
                  trControl = train.control)

print(modelcv4)

# Outcome: delegation.ratiocom
modelcv1 <- train(delegation.ratiocom ~ recitals, data = mm,
                  method = "lm",
                  trControl = train.control)

print(modelcv1)

modelcv2 <- train(delegation.ratiocom ~ meanSentenceChars + n + countWords, data = mm,
                  method = "lm",
                  trControl = train.control)

print(modelcv2)


```

# Figure SI 9 

```{r, eval = T, echo = T, include=TRUE, fig.width= 6, fig.height= 6, fig.align = "center", out.width="50%", warning=F}

# Load the texts (expected to provide the data frame `df` with a `group`
# column)
load("./keyness.RData")

keycorpus <- corpus(df)

# Explicit preprocessing pipeline. Calling dfm() directly on a corpus with the
# `groups`, `stem` and `remove` arguments is deprecated in quanteda 3, so each
# step is spelled out: tokenise (dropping numbers, punctuation and symbols),
# lower-case, remove English stopwords, then stem.
keytokens <- tokens(keycorpus, remove_numbers = TRUE, remove_punct = TRUE,
                    remove_symbols = TRUE)
keytokens <- tokens_tolower(keytokens)
keytokens <- tokens_remove(keytokens, stopwords("english"))
keytokens <- tokens_wordstem(keytokens)

# Document-feature matrix with one row per group
keydfm <- dfm_group(dfm(keytokens, tolower = FALSE), groups = df$group)

# Keyness of each feature, with the "easy" group as the target
keyness <- textstat_keyness(keydfm, target = "easy")

# Plot the 15 most distinctive features on each side, legend on top
textplot_keyness(keyness, margin = 0.2, n = 15,
                 color = c("lightblue", "gray60"), show_legend = TRUE) +
  theme(legend.position = "top")

```

